In [2]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from tester import test_classifier  # dump_classifier_and_data was already imported above
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV

In [3]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary','deferral_payments', 'total_payments', 'loan_advances',
                 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                 'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person', 
                 'from_messages', 'from_this_person_to_poi', 
                 'shared_receipt_with_poi'] # You will need to use more features

In [4]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [5]:
# Let's look at some dataset statistics
print "# of records : ", len(data_dict)


# of records :  146

In [6]:
# POIs vs non-POIs
poi_count = 0
for p in data_dict.values():
    if p['poi']:
        poi_count += 1
print "# of POIs: ", poi_count
print "# of Non POIs: ", len(data_dict) - poi_count


# of POIs:  18
# of Non POIs:  128

In [7]:
# Missing values in features
print "# of missing values in features: "
NaNInFeatures = [0] * len(features_list)
for person in data_dict.values():
    for j, feature in enumerate(features_list):
        if person[feature] == 'NaN':
            NaNInFeatures[j] += 1

for i, feature in enumerate(features_list):
    print feature, NaNInFeatures[i]


# of missing values in features: 
poi 0
salary 51
deferral_payments 107
total_payments 21
loan_advances 142
bonus 64
restricted_stock_deferred 128
deferred_income 97
total_stock_value 20
expenses 51
exercised_stock_options 44
other 53
long_term_incentive 80
restricted_stock 36
director_fees 129
to_messages 60
from_poi_to_this_person 60
from_messages 60
from_this_person_to_poi 60
shared_receipt_with_poi 60
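
Several features are mostly missing: 'loan_advances' is 'NaN' for 142 of 146 records and 'director_fees' for 129. As a rough, untested sketch (the 80% threshold is an arbitrary choice, not part of the project), we could reuse the counts above to flag near-empty features as candidates to drop:

# Untested sketch: flag features that are mostly 'NaN' (threshold is arbitrary).
nan_threshold = 0.8
n_records = float(len(data_dict))
sparse_features = [f for f, n in zip(features_list, NaNInFeatures)
                   if n / n_records > nan_threshold]
print "features with > 80% missing values: ", sparse_features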

In [9]:
### Task 2: Remove outliers

# Let's look at the names.
s = []
for person in data_dict.keys():
    s.append(person)
    if len(s) == 4:
        print '{:<30}{:<30}{:<30}{:<30}'.format(*s)
        s = []
if s:  # 146 names is not a multiple of 4, so print the leftover row
    print ''.join('{:<30}'.format(name) for name in s)


METTS MARK                    BAXTER JOHN C                 ELLIOTT STEVEN                CORDES WILLIAM R              
HANNON KEVIN P                MORDAUNT KRISTINA M           MEYER ROCKFORD G              MCMAHON JEFFREY               
HORTON STANLEY C              PIPER GREGORY F               HUMPHREY GENE E               UMANOFF ADAM S                
BLACHMAN JEREMY M             SUNDE MARTIN                  GIBBS DANA R                  LOWRY CHARLES P               
COLWELL WESLEY                MULLER MARK S                 JACKSON CHARLENE R            WESTFAHL RICHARD K            
WALTERS GARETH W              WALLS JR ROBERT H             KITCHEN LOUISE                CHAN RONNIE                   
BELFER ROBERT                 SHANKMAN JEFFREY A            WODRASKA JOHN                 BERGSIEKER RICHARD P          
URQUHART JOHN A               BIBI PHILIPPE A               RIEKER PAULA H                WHALEY DAVID A                
BECK SALLY W                  HAUG DAVID L                  ECHOLS JOHN B                 MENDELSOHN JOHN               
HICKERSON GARY J              CLINE KENNETH W               LEWIS RICHARD                 HAYES ROBERT E                
MCCARTY DANNY J               KOPPER MICHAEL J              LEFF DANIEL P                 LAVORATO JOHN J               
BERBERIAN DAVID               DETMERING TIMOTHY J           WAKEHAM JOHN                  POWERS WILLIAM                
GOLD JOSEPH                   BANNANTINE JAMES M            DUNCAN JOHN H                 SHAPIRO RICHARD S             
SHERRIFF JOHN R               SHELBY REX                    LEMAISTRE CHARLES             DEFFNER JOSEPH M              
KISHKILL JOSEPH G             WHALLEY LAWRENCE G            MCCONNELL MICHAEL S           PIRO JIM                      
DELAINEY DAVID W              SULLIVAN-SHAKLOVITZ COLLEEN   WROBEL BRUCE                  LINDHOLM TOD A                
MEYER JEROME J                LAY KENNETH L                 BUTTS ROBERT H                OLSON CINDY K                 
MCDONALD REBECCA              CUMBERLAND MICHAEL S          GAHN ROBERT S                 MCCLELLAN GEORGE              
HERMANN ROBERT J              SCRIMSHAW MATTHEW             GATHMANN WILLIAM D            HAEDICKE MARK E               
BOWEN JR RAYMOND M            GILLIS JOHN                   FITZGERALD JAY L              MORAN MICHAEL P               
REDMOND BRIAN L               BAZELIDES PHILIP J            BELDEN TIMOTHY N              DURAN WILLIAM D               
THORN TERENCE H               FASTOW ANDREW S               FOY JOE                       CALGER CHRISTOPHER F          
RICE KENNETH D                KAMINSKI WINCENTY J           LOCKHART EUGENE E             COX DAVID                     
OVERDYKE JR JERE C            PEREIRA PAULO V. FERRAZ       STABLER FRANK                 SKILLING JEFFREY K            
BLAKE JR. NORMAN P            SHERRICK JEFFREY B            PRENTICE JAMES                GRAY RODNEY                   
PICKERING MARK R              THE TRAVEL AGENCY IN THE PARK NOLES JAMES L                 KEAN STEVEN J                 
TOTAL                         FOWLER PEGGY                  WASAFF GEORGE                 WHITE JR THOMAS E             
CHRISTODOULOU DIOMEDES        ALLEN PHILLIP K               SHARP VICTORIA T              JAEDICKE ROBERT               
WINOKUR JR. HERBERT S         BROWN MICHAEL                 BADUM JAMES P                 HUGHES JAMES A                
REYNOLDS LAWRENCE             DIMICHELE RICHARD G           BHATNAGAR SANJAY              CARTER REBECCA C              
BUCHANAN HAROLD G             YEAP SOON                     MURRAY JULIA H                GARLAND C KEVIN               
DODSON KEITH                  YEAGER F SCOTT                HIRKO JOSEPH                  DIETRICH JANET R              
DERRICK JR. JAMES V           FREVERT MARK A                PAI LOU L                     BAY FRANKLIN R                
HAYSLETT RODERICK J           FUGH JOHN L                   FALLON JAMES B                KOENIG MARK E                 
SAVAGE FRANK                  IZZO LAWRENCE L               TILNEY ELIZABETH A            MARTIN AMANDA K               
BUY RICHARD B                 GRAMM WENDY L                 CAUSEY RICHARD A              TAYLOR MITCHELL S             
DONAHUE JR JEFFREY M          GLISAN JR BEN F               

We see above that there is an entry called "TOTAL". That obviously cannot be a person's name, so we need to remove it from the dataset. Before we do, let's confirm that it is what the name suggests.


In [10]:
print "print out some values of the observation 'TOTAL'"
for name, person in data_dict.iteritems():
	if name == 'TOTAL':
		print person


print out some values of the observation 'TOTAL'
{'salary': 26704229, 'to_messages': 'NaN', 'deferral_payments': 32083396, 'total_payments': 309886585, 'exercised_stock_options': 311764000, 'bonus': 97343619, 'restricted_stock': 130322299, 'shared_receipt_with_poi': 'NaN', 'restricted_stock_deferred': -7576788, 'total_stock_value': 434509511, 'expenses': 5235198, 'loan_advances': 83925000, 'from_messages': 'NaN', 'other': 42667589, 'from_this_person_to_poi': 'NaN', 'poi': False, 'director_fees': 1398517, 'deferred_income': -27992891, 'long_term_incentive': 48521928, 'email_address': 'NaN', 'from_poi_to_this_person': 'NaN'}

In [11]:
# The dataset still includes the 'TOTAL' row, so summing every salary counts
# each dollar twice: once in a person's own row and once in 'TOTAL'. Halving
# the sum should therefore reproduce the 'TOTAL' salary if that row really
# is a column total.
salary = []
for person in data_dict.values():
    if float(person['salary']) > 0:  # float('NaN') > 0 is False, so 'NaN' is skipped
        salary.append(float(person['salary']))
print "the sum of salary of all individual persons is: ", np.sum(salary)/2


the sum of salary of all individual persons is:  26704229.0

The half-sum matches the salary recorded against "TOTAL", confirming that it is a spreadsheet total row rather than a person.
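
A plot makes the same point visually. Since matplotlib is already loaded, a quick sketch (not run here) would show "TOTAL" as a single extreme point far away from every real person:

# Untested sketch: 'TOTAL' should appear as one extreme point in the corner.
for person in data_dict.values():
    s = float(person['salary'])
    b = float(person['bonus'])
    if s > 0 and b > 0:  # skips 'NaN' entries, since float('NaN') > 0 is False
        plt.scatter(s, b)
plt.xlabel("salary")
plt.ylabel("bonus")
plt.show()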


In [12]:
# Let's remove this TOTAL record.
data_dict.pop('TOTAL')


Out[12]:
{'bonus': 97343619,
 'deferral_payments': 32083396,
 'deferred_income': -27992891,
 'director_fees': 1398517,
 'email_address': 'NaN',
 'exercised_stock_options': 311764000,
 'expenses': 5235198,
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 83925000,
 'long_term_incentive': 48521928,
 'other': 42667589,
 'poi': False,
 'restricted_stock': 130322299,
 'restricted_stock_deferred': -7576788,
 'salary': 26704229,
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 309886585,
 'total_stock_value': 434509511}

In [13]:
# There is a also a record which belongs to "THE TRAVEL AGENCY IN THE PARK". 
# This is not a person and hence should be removed.
data_dict.pop("THE TRAVEL AGENCY IN THE PARK")


Out[13]:
{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [15]:
# Number of records after removing TOTAL & THE TRAVEL AGENCY IN THE PARK
print "# of records after removals: ", len(data_dict)


# of records after removals:  144

In [16]:
### Task 3: Create new feature(s)

### Store to my_dataset for easy export below.
my_dataset = data_dict

print "we create two new features here 'to_poi_message_ratio' and 'from_poi_message_ratio' "
for person in my_dataset.values():
    person['to_poi_message_ratio'] = 0
    person['from_poi_message_ratio'] = 0
    if float(person['from_messages']) > 0:
        person['to_poi_message_ratio'] = float(person['from_this_person_to_poi'])/float(person['from_messages'])
    if float(person['to_messages']) > 0:
        person['from_poi_message_ratio'] = float(person['from_poi_to_this_person'])/float(person['to_messages'])
    
features_list.extend(['to_poi_message_ratio', 'from_poi_message_ratio'])


we create two new features here 'to_poi_message_ratio' and 'from_poi_message_ratio' 
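
To eyeball whether the new ratios carry any signal, a quick sketch (not run here) plots them against each other, coloring POIs differently:

# Untested sketch: POIs in red, non-POIs in blue.
for person in my_dataset.values():
    color = 'r' if person['poi'] else 'b'
    plt.scatter(person['to_poi_message_ratio'],
                person['from_poi_message_ratio'], color=color)
plt.xlabel("to_poi_message_ratio")
plt.ylabel("from_poi_message_ratio")
plt.show()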

In [17]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
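
As a quick sanity check (assuming featureFormat returns one row per person with 'NaN' mapped to 0, as the course tools do):

# Sanity check: shape and class balance of the extracted data.
print "feature matrix shape: ", np.array(features).shape
print "# of POIs in extracted data: ", int(sum(labels))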

In [31]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
#clf = GaussianNB()  # template starting point; superseded by the tree below

clf = DecisionTreeClassifier(min_samples_split=6, random_state=10)
test_classifier(clf, my_dataset, features_list)

#clf = ensemble.RandomForestClassifier(criterion='gini', n_estimators=14, max_depth=7,
#                                      max_features=None, random_state=42, min_samples_split=1)
#clf = AdaBoostClassifier(algorithm='SAMME')

# Note: the parameter names below assume a Pipeline with steps named
# 'reduce_dim' and 'tree'; see the sketch after this cell's output.
#params = dict(reduce_dim__n_components=[1, 2, 3], tree__min_samples_split=[2, 4, 6, 8, 10])
#clf = GridSearchCV(clf, param_grid=params, n_jobs=-1, scoring='recall')

#test_classifier(clf, my_dataset, features_list)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            presort=False, random_state=10, splitter='best')
	Accuracy: 0.82573	Precision: 0.32356	Recall: 0.28150	F1: 0.30107	F2: 0.28901
	Total predictions: 15000	True positives:  563	False positives: 1177	False negatives: 1437	True negatives: 11823
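
The commented-out grid search above refers to parameters such as reduce_dim__n_components, which only exist inside a Pipeline whose steps are named 'reduce_dim' and 'tree'. A sketch of how that search could be wired up (not run here; the step names and parameter ranges are illustrative, not the tuned result):

# Untested sketch: PCA + decision tree pipeline searched with GridSearchCV.
# Step names 'reduce_dim' and 'tree' match the commented-out params above.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pipe = Pipeline([('reduce_dim', PCA()),
                 ('tree', DecisionTreeClassifier(random_state=10))])
params = dict(reduce_dim__n_components=[1, 2, 3],
              tree__min_samples_split=[2, 4, 6, 8, 10])
search = GridSearchCV(pipe, param_grid=params, n_jobs=-1, scoring='recall')
search.fit(features, labels)
print search.best_params_
#clf = search.best_estimator_  # would then be evaluated with test_classifier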


In [67]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
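
Since tester.py evaluates with StratifiedShuffleSplit, the same scheme can be mirrored locally. A sketch using the pre-0.18 cross_validation API imported above (not run here; the fold count and test size are illustrative):

# Untested sketch: average precision/recall over stratified shuffle splits,
# mirroring the evaluation style of tester.py.
from sklearn.metrics import precision_score, recall_score

labels_arr = np.array(labels)
features_arr = np.array(features)
sss = StratifiedShuffleSplit(labels_arr, n_iter=100, test_size=0.3, random_state=42)
precisions, recalls = [], []
for train_idx, test_idx in sss:
    clf.fit(features_arr[train_idx], labels_arr[train_idx])
    pred = clf.predict(features_arr[test_idx])
    precisions.append(precision_score(labels_arr[test_idx], pred))
    recalls.append(recall_score(labels_arr[test_idx], pred))
print "mean precision: ", np.mean(precisions)
print "mean recall: ", np.mean(recalls)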

In [ ]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)